

#####################################################
###### 02_text_apicalls.R
###### Makes API calls to LLMs to code open-ended 
###### text responses
###### Note: LLM responses are non-deterministic 
###### This code was used to produce 
###### llm_results_test.RDS; llm_results_train.RDS;
###### llm_results_unlab.RDS (included in rep package)
#####################################################

library(httr)
library(jsonlite)
library(dplyr)
library(tidyr)
library(purrr)
library(furrr)
library(here)
library(stringr)
library(claudeR)
library(openai)

######################
## Load Survey Data ##
######################

## Read the analytic survey file
df_all <- read.csv(here("data/analytic_df.csv"))

## Restrict to parents and derive the variables used further down
df_paronly <- df_all |>
  filter(!(derived_parent %in% c("Other", "Never parent"))) |>
  mutate(
    # "Weighted lottery" becomes the reference level of the condition factor
    derived_statusquo_cond = relevel(
      as.factor(derived_statusquo_cond),
      ref = "Weighted lottery"
    ),
    # Open-text explanation of the respondent's choice; the codes
    # "77", "98" and "99" are replaced by NA
    derived_binary_freeresp = ifelse(
      derived_binary_freeresp %in% c("77", "98", "99"),
      NA_character_,
      derived_binary_freeresp
    ),
    # Cell label: <condition>_<algfairer|otherfairer>_<changeview|sameview>
    derived_cell = sprintf(
      "%s_%s_%s",
      derived_statusquo_cond,
      ifelse(derived_alg_morefair, "algfairer", "otherfairer"),
      ifelse(derived_changeview, "changeview", "sameview")
    ),
    # Approximate word count: runs of non-word characters in the trimmed
    # response, plus one
    derived_fr_nword = str_count(
      trimws(derived_binary_freeresp),
      pattern = "\\W+"
    ) + 1
  )

# Hand-coded free-response observations
combined_qual <- read.csv(here("data/fr_cleaned_10212025.csv"))

# Keep only rows marked as coded. Rows flagged inadequate_response_tocode
# are NOT removed at this point.
combined_qual_valid <- combined_qual |>
  filter(coded. == "Yes") |>
  mutate(
    # Status-quo condition: limited_cell with the choice suffix stripped
    condition = gsub("\\_algfairer|\\_otherfairer", "", limited_cell),
    # Readable label for which option the respondent judged fairer
    choice = ifelse(
      grepl("algfairer", limited_cell),
      "Alg. fairer",
      "Other fairer"
    )
  )

# The raw hand-coded table is no longer needed
rm(combined_qual)

# Attach the hand codes to the parent sample by CaseId. Columns of the
# hand-coded table whose names start with "derived" are dropped first.
par_joined <- inner_join(
  df_paronly,
  select(combined_qual_valid, -starts_with("derived")),
  by = "CaseId"
)

################
# Set up tests #
################

# Keep the coding variables and derive the hand-coded "truth" columns plus
# the text fragments that are later inserted into the prompt
par_joined_clean <- par_joined |>
  select(
    CaseId, derived_compcheck, derived_binary_freeresp,
    limited_cell, inadequate_response_tocode,
    salience_targeting, valence_targeted_v_not,
    salience_impersonal_personal, valence_impersonal_v_personal
  ) |>
  # Recode hand codes into forms that are easier to evaluate later
  mutate(
    inadequate_response_tocode_bin = ifelse(
      inadequate_response_tocode == "Yes", 1, 0
    ),
    # 1 = salient, NA = response inadequate to code, 0 = otherwise
    salience_target_truth = case_when(
      salience_targeting == "Salient" ~ 1,
      inadequate_response_tocode == "Yes" ~ NA,
      TRUE ~ 0
    ),
    salience_impersonal_truth = case_when(
      salience_impersonal_personal == "Salient" ~ 1,
      inadequate_response_tocode == "Yes" ~ NA,
      TRUE ~ 0
    ),
    # Collapse the hand-coded valence into the categories used in the prompt
    valence_impersonal_truth = case_when(
      is.na(salience_impersonal_truth) ~ "N/A (not salient)",
      valence_impersonal_v_personal == "Other (salient but unclear)" ~
        "Other (salient but unclear)",
      valence_impersonal_v_personal %in% c(
        "Impersonal is bad",
        "Personal is good",
        "Personal is good + impersonal is bad"
      ) ~ "Impersonal is bad",
      TRUE ~ "Impersonal is good"
    )
  ) |>
  rename(valence_target_truth = valence_targeted_v_not) |>
  # limited_cell has the form <status quo>_<choice>
  separate(limited_cell, into = c("status_quo", "choice"), sep = "_") |>
  mutate(
    # Wording of the status-quo method as it appears in the prompt
    status_quo_text = case_when(
      status_quo == "Counselor discretion" ~ "the school counselor's judgment",
      status_quo == "Parent requests" ~ "parent requests as they are made",
      status_quo == "Simple rule" ~ "a test score and family income cutoff",
      status_quo == "Weighted lottery" ~
        "a lottery with higher odds for some students"
    ),
    preference_desc = ifelse(
      choice == "algfairer",
      paste0("algorithms are fairer than ", status_quo_text),
      paste0("algorithms are less fair than ", status_quo_text)
    ),
    preferred_option = ifelse(
      choice == "algfairer", "a predictive model", status_quo_text
    ),
    notpreferred_option = ifelse(
      choice == "algfairer", status_quo_text, "a predictive model"
    )
  ) |>
  # Consistency patch on top of the case_when() above: valence is "N/A"
  # whenever the matching salience is 0.
  # NOTE(review): where salience is NA, ifelse() returns NA for the valence.
  mutate(
    valence_target_truth = ifelse(
      salience_target_truth == 0, "N/A", valence_target_truth
    ),
    valence_impersonal_truth = ifelse(
      salience_impersonal_truth == 0, "N/A", valence_impersonal_truth
    )
  )

# Responses without enough information to code:
# salience is set to 0 and valence to "N/A" for those rows
par_joined_s1 <- par_joined_clean |>
  select(CaseId, inadequate_response_tocode_bin, contains("truth")) |>
  mutate(
    across(
      contains("valence"),
      ~ ifelse(inadequate_response_tocode_bin == 1, "N/A", .x)
    ),
    across(
      contains("salience"),
      ~ ifelse(inadequate_response_tocode_bin == 1, 0, .x)
    )
  ) |>
  # Salience becomes a factor with level order 1, 0
  mutate(across(contains("salience"), ~ factor(.x, levels = c(1, 0))))

# The split is made on the cleaned, hand-coded parent sample
par_joined_select <- par_joined_clean

# 80/20 train-test split with a fixed seed
set.seed(1001)
train_rand <- sample(
  seq_len(nrow(par_joined_select)),
  size = round(0.8 * nrow(par_joined_select))
)
par_train <- par_joined_select[train_rand, ]
par_test <- par_joined_select[-train_rand, ]

# Set up prompt
# The prompt is stored as four fixed text fragments. When prompt_B is built
# they are joined with paste() in this order:
#   set_promptB_1, preference_desc (lower-cased), set_promptB_2,
#   preferred_option, set_promptB_2b, notpreferred_option, set_promptB_3,
#   then the survey response wrapped in <survey response> tags.
# The fragments are the exact text sent to the models; do not edit them if
# the stored LLM results are to remain comparable.

# Opening: task description, ends right before the respondent's stated preference
set_promptB_1 <- "You will analyze a survey response. The respondent was previously asked if they preferred a predictive algorithm or status quo method for allocating tutors to students. The respondent indicated they thought that " 
# Start of the quoted follow-up question, ends right before the preferred option
set_promptB_2 <- ". The response you will review was their answer to the follow-up question 'Explain why you think "
# Connector placed between the preferred and the not-preferred option
set_promptB_2b <- " is fairer than" 
# Closes the quoted question, then gives the coding instructions (Q1, Q1b,
# Q2, Q2b) and the required answer format.
# NOTE(review): in the format section the "Q2:" line has no opening "{",
# unlike the "Q1:" line; left as is because this text is what was sent.
set_promptB_3 <- "'. Based on the response, answer the questions below. We'll do this in steps. 

Step 1: Analyze salience of resource targeting
Q1: Is targeting of resources a salient concern? Classify text based on whether it implicitly or explicitly expresses concerns about how or the degree to which resources are targeted to individual students rather than being universally or equally provided to all students.
Answer 'yes' if the response mentions or implies:
* Targeting or allocation of tutors to specific students/groups
* Criteria for selecting students for support
* Targeted approaches or universal access
* Factors associated with student success or need
* Timeliness or relevance of information used to allocate 
* Concerns about how tutors are targeted are implied through mentions of personal knowledge, judgment, bias, objectivity, or the unique needs of students as factors in fair or effective allocation.

Step 2: If Q1 is Yes, determine the valence of targeting
Q1b: If Q1 is yes, what is the respondent's view on finer-grained targeting? If you answered no to Q1, answer N/A (not salient). 
  Options: [More targeting is good, Less targeting is good, Other (salient but unclear), N/A (not salient)]
  
Definitions:
* More targeting is good: The respondent supports allocating resources more precisely to individual students based on specific needs, abilities, or circumstances. This includes advocating for personalized assessment by parents, counselors, or teachers to identify students who need help, as they can provide sensitive and timely insights that algorithms might miss.
* Less targeting is good: The respondent prefers resources to be allocated universally or equally among students, or opposes targeting resources to specific students or groups. This includes advocating against the use of specific criteria or assessments to determine who receives additional resources.
* Other (salient but unclear): The respondent discusses resource targeting, but it is unclear whether they support more or less targeting.
* N/A (not salient): The respondent does not discuss resource targeting.

Step 3: analyze salience of personal vs. impersonal allocation
Q2: Is the personal vs. impersonal nature of the allocation method a salient concern?
Answer 'yes' if the response explicitly mentions or implies:
* Objectivity, subjectivity, personal bias, or favoritism
* Consideration of individual personal circumstances
* Role of personal knowledge vs. reliance on standardized measures or data-driven criteria
* Quantitative vs. qualitative information, especially if the latter involves personalized insight
* Discusses personal relationships, personal knowledge, or the human judgment of parents, teachers, or counselors
* Fairness of personal attention vs. systematic or algorithmic allocation
* Ability to account for unique student characteristics that standardized methods might overlook.

Step 4: If Q2 is Yes, determine the valence of impersonality
Q2b: If Q2 is yes, what is the respondent's view on greater impersonality? If you answered no to Q2, answer N/A (not salient)
Options: [Impersonal is bad, Impersonal is good, Other (salient but unclear), N/A (not salient)]

Definitions:
* Impersonal is good: The respondent supports using impersonal, standardized, or data-driven methods for allocating resources, even if they suggest supplementing with personal input.
* Impersonal is bad: The respondent prefers personal, human-based assessments over impersonal methods for allocation.
* Other (salient but unclear): The respondent discusses impersonality, but it is unclear if they prefer more personal or impersonal approaches.
* N/A (not salient): The respondent does not address the personal vs. impersonal nature of either method. 

Step 5: Format your response
Format your response in the following way (do not provide any additional text): Q1: {1 if yes, 0 if no}; 
Q1b: {More targeting is good, Less targeting is good, Other (salient but unclear), N/A (not salient)};
Q2: 1 if yes, 0 if no};
Q2b: {Impersonal is bad, Impersonal is good, Other (salient but unclear), N/A (not salient)}.

Now, analyze the following response: "

# Assemble one prompt per observation. The fixed fragments, the
# respondent-specific wording and the free response (wrapped in
# <survey response> tags) are joined with single spaces.
par_train_prompts <- par_train |>
  mutate(
    prompt_B = paste(
      set_promptB_1, tolower(preference_desc),
      set_promptB_2, preferred_option,
      set_promptB_2b, notpreferred_option,
      set_promptB_3,
      "<survey response>", derived_binary_freeresp, "</survey response>",
      sep = " "
    )
  )

# Same construction for the held-out test rows
par_test_prompts <- par_test |>
  mutate(
    prompt_B = paste(
      set_promptB_1, tolower(preference_desc),
      set_promptB_2, preferred_option,
      set_promptB_2b, notpreferred_option,
      set_promptB_3,
      "<survey response>", derived_binary_freeresp, "</survey response>",
      sep = " "
    )
  )

##################
# API call functions
##################

# Set up parallelization
# Registers a multisession plan with 4 workers; the future_pmap() calls
# below use whatever plan is registered here.
plan(multisession, workers = 4)

# For Claude
# Sends one prompt to the Anthropic API through claudeR() and returns the
# reply, retrying on errors up to `max_retries` attempts in total.
# Note LLM results are not deterministic.
#
# Args:
#   id:          Identifier of the observation being coded. Not sent to the
#                API; kept so the function can be mapped over (id, text).
#   text:        Full prompt, sent as a single "user" message.
#   temperature: Sampling temperature passed to claudeR().
#   model_name:  Model identifier passed to claudeR().
#   max_retries: Maximum number of attempts (default 1 = no retry).
#   retry_delay: Seconds to wait between attempts.
#   api_key:     API key. Defaults to the ANTHROPIC_API_KEY environment
#                variable; if that is unset, the placeholder below is used
#                and must be replaced with a real key.
#
# Returns:
#   On success, c(llm_response = <reply>, error = NA). If every attempt
#   fails, c(llm_response = NA, error = <text of the last error>).
get_response_claude <- function(
    id, text, temperature, model_name,
    max_retries = 1, retry_delay = 1,
    api_key = Sys.getenv("ANTHROPIC_API_KEY", unset = "API-KEY-HERE")) {
  attempt <- 1
  last_error <- "No attempt was made (max_retries < 1)"

  while (attempt <= max_retries) {
    # The handler only reports the failure. The decision to return or retry
    # is taken below, in the function's own frame, so that the final failure
    # actually ends the loop instead of being discarded.
    outcome <- tryCatch(
      list(
        ok = TRUE,
        value = claudeR(
          prompt = list(
            list(
              role = "user",
              content = paste0(text)
            )
          ),
          model = model_name,
          max_tokens = 4096,
          temperature = temperature,
          api_key = api_key
        )
      ),
      error = function(e) list(ok = FALSE, value = as.character(e))
    )

    if (outcome$ok) {
      return(c(llm_response = outcome$value, error = NA))
    }

    last_error <- outcome$value
    if (attempt < max_retries) {
      # %s (not %d) so that fractional delays such as 0.5 can be printed
      message(sprintf("Attempt %d failed. Retrying in %s seconds...",
                      attempt, retry_delay))
      Sys.sleep(retry_delay)
    }
    attempt <- attempt + 1
  }

  # All attempts failed: hand the last error back to the caller
  c(llm_response = NA, error = last_error)
}

# For GPT
# Sends one prompt to the OpenAI chat completions API through
# create_chat_completion() and returns the reply, retrying on errors up to
# `max_retries` attempts in total.
# Note LLM results are not deterministic.
#
# Args:
#   id:          Identifier of the observation being coded. Not sent to the
#                API; kept so the function can be mapped over (id, text).
#   text:        Full prompt, sent as a single "user" message.
#   temperature: Sampling temperature passed to create_chat_completion().
#   model_name:  Model identifier passed to create_chat_completion().
#   max_retries: Maximum number of attempts (default 1 = no retry).
#   retry_delay: Seconds to wait between attempts.
#   api_key:     API key. Defaults to the OPENAI_API_KEY environment
#                variable; if that is unset, the placeholder below is used
#                and must be replaced with a real key.
#
# Returns:
#   On success, c(llm_response = <reply text>, error = NA). If every attempt
#   fails, c(llm_response = NA, error = <text of the last error>).
get_response_gpt <- function(
    id, text, temperature, model_name,
    max_retries = 1, retry_delay = 1,
    api_key = Sys.getenv("OPENAI_API_KEY", unset = "API KEY HERE")) {
  attempt <- 1
  last_error <- "No attempt was made (max_retries < 1)"

  while (attempt <= max_retries) {
    # The handler only reports the failure. The decision to return or retry
    # is taken below, in the function's own frame, so that the final failure
    # actually ends the loop instead of being discarded.
    outcome <- tryCatch(
      {
        response <- create_chat_completion(
          model = model_name,
          # Spelled out in full rather than relying on partial matching
          messages = list(list("role" = "user",
                               "content" = paste0(text))),
          max_tokens = 4096,
          temperature = temperature,
          openai_api_key = api_key
        )
        list(ok = TRUE, value = response$choices$message.content)
      },
      error = function(e) list(ok = FALSE, value = as.character(e))
    )

    if (outcome$ok) {
      return(c(llm_response = outcome$value, error = NA))
    }

    last_error <- outcome$value
    if (attempt < max_retries) {
      # %s (not %d) so that fractional delays such as 0.5 can be printed
      message(sprintf("Attempt %d failed. Retrying in %s seconds...",
                      attempt, retry_delay))
      Sys.sleep(retry_delay)
    }
    attempt <- attempt + 1
  }

  # All attempts failed: hand the last error back to the caller
  c(llm_response = NA, error = last_error)
}


##############################
# Make API calls on training #
##############################
# Model identifiers used for the API calls.
# Alternatives noted alongside the original settings:
#   GPT:    "gpt-4o-2024-08-06"
#   Claude: "claude-3-opus-20240229"
set_gpt_mod <- "gpt-4o-mini-2024-07-18"
set_claude_mod <- "claude-3-5-sonnet-20240620"

# Code the training prompts with Claude: one API call per row
results_train_claude <- par_train_prompts |>
  select(CaseId, prompt_B, ends_with("_bin"), ends_with("_truth")) |>
  mutate(
    result = future_pmap(
      list(id = CaseId, text = prompt_B),
      ~ get_response_claude(
        id = ..1,
        text = ..2,
        temperature = 0,
        model_name = set_claude_mod
      ),
      .progress = TRUE
    )
  ) |>
  # Spread the elements returned for each row into their own columns
  unnest_wider(result)

# Code the training prompts with GPT: one API call per row
results_train_gpt <- par_train_prompts |>
  select(CaseId, prompt_B, ends_with("_bin"), ends_with("_truth")) |>
  mutate(
    result = future_pmap(
      list(id = CaseId, text = prompt_B),
      ~ get_response_gpt(
        id = ..1,
        text = ..2,
        temperature = 0,
        model_name = set_gpt_mod
      ),
      .progress = TRUE
    )
  ) |>
  # Spread the elements returned for each row into their own columns
  unnest_wider(result)

# Pull one answer out of an LLM reply that has been split into lines.
#
# Args:
#   text:    Character vector, one element per line of the reply.
#   pattern: Regular expression identifying the answer's label (e.g. "^Q1:").
#
# Returns:
#   The first line matching `pattern`, with the matched label removed, a
#   trailing semicolon (and any whitespace after it) removed, and surrounding
#   whitespace trimmed. NA_character_ when no line matches.
extract_answer <- function(text, pattern) {
  hits <- text[str_detect(text, pattern)]

  if (length(hits) == 0) {
    return(NA_character_)
  }

  # Only the first matching line is used
  without_label <- str_remove(hits[1], pattern)
  without_semicolon <- str_remove(without_label, ";\\s*$")
  str_trim(without_semicolon)
}

# Stack the Claude and GPT training results, tagging each row with the
# model that produced it
results_train <- bind_rows(
  mutate(results_train_claude, model = set_claude_mod),
  mutate(results_train_gpt, model = set_gpt_mod)
)

# Parse the raw replies into one prediction column per question
results_train_parsed <- results_train |>
  # Trim the reply and break it into lines
  mutate(split_response = str_split(str_trim(llm_response), "\n")) |>
  rowwise() |>
  mutate(
    # Q1 / Q1b: salience and valence of targeting
    salience_target_prediction =
      extract_answer(split_response, "^Q1:|^1\\.?\\s*"),
    valence_target_prediction =
      extract_answer(split_response, "^Q1b:|^1b\\.?\\s*"),
    # Q2 / Q2b: salience and valence of impersonality
    salience_impersonal_prediction =
      extract_answer(split_response, "^Q2:|^2\\.?\\s*"),
    valence_impersonal_prediction =
      extract_answer(split_response, "^Q2b:|^2b\\.?\\s*")
  ) |>
  ungroup() |>
  select(-split_response) |>
  # The requested format ends the last answer with a period; drop it
  mutate(
    valence_impersonal_prediction =
      str_remove(valence_impersonal_prediction, "\\.$")
  ) |>
  # Bring the original free response back in for inspection
  left_join(
    select(df_paronly, CaseId, derived_binary_freeresp),
    by = "CaseId"
  ) |>
  # Any valence starting with "N/A" is shortened to "N/A"
  mutate(across(
    contains("valence"),
    ~ ifelse(str_detect(.x, "^N/A"), "N/A", .x)
  )) |>
  # Strip curly braces echoed back from the format template
  mutate(across(
    contains("prediction"),
    ~ str_replace_all(.x, "[\\{\\}]", "")
  )) |>
  # Salience becomes a factor with level order 1, 0
  mutate(across(contains("salience"), ~ factor(.x, levels = c(1, 0))))

################################
# Final prep and store results #
################################

# Final output for the training set: replace the truth columns carried
# along so far with the versions from par_joined_s1, then shorten any
# valence starting with "N/A" to "N/A"
results_train_out <- results_train_parsed |>
  select(-contains("truth"), -inadequate_response_tocode_bin) |>
  left_join(par_joined_s1, by = "CaseId") |>
  mutate(across(
    contains("valence"),
    ~ ifelse(str_detect(.x, "^N/A"), "N/A", .x)
  ))

# Store LLM coded entries 
# Commented out; replication package contains the responses we received
# saveRDS(results_train_out, here("data/llm_results_train.RDS"))

#######################
## Apply to test set ##
#######################

# Code the test prompts with Claude: one API call per row
results_test_claude <- par_test_prompts |>
  select(CaseId, prompt_B, ends_with("_bin"), ends_with("_truth")) |>
  mutate(
    result = future_pmap(
      list(id = CaseId, text = prompt_B),
      ~ get_response_claude(
        id = ..1,
        text = ..2,
        temperature = 0,
        model_name = set_claude_mod
      ),
      .progress = TRUE
    )
  ) |>
  # Spread the elements returned for each row into their own columns
  unnest_wider(result)

# Parse the raw test-set replies into one prediction column per question
results_test_parsed <- results_test_claude |>
  mutate(
    model = set_claude_mod,
    # Trim the reply and break it into lines
    split_response = str_split(str_trim(llm_response), "\n")
  ) |>
  rowwise() |>
  mutate(
    # Q1 / Q1b: salience and valence of targeting
    salience_target_prediction =
      extract_answer(split_response, "^Q1:|^1\\.?\\s*"),
    valence_target_prediction =
      extract_answer(split_response, "^Q1b:|^1b\\.?\\s*"),
    # Q2 / Q2b: salience and valence of impersonality
    salience_impersonal_prediction =
      extract_answer(split_response, "^Q2:|^2\\.?\\s*"),
    valence_impersonal_prediction =
      extract_answer(split_response, "^Q2b:|^2b\\.?\\s*")
  ) |>
  ungroup() |>
  select(-split_response) |>
  # The requested format ends the last answer with a period; drop it
  mutate(
    valence_impersonal_prediction =
      str_remove(valence_impersonal_prediction, "\\.$")
  ) |>
  # Bring the original free response back in for inspection
  left_join(
    select(df_paronly, CaseId, derived_binary_freeresp),
    by = "CaseId"
  ) |>
  # Any valence starting with "N/A" is shortened to "N/A"
  mutate(across(
    contains("valence"),
    ~ ifelse(str_detect(.x, "^N/A"), "N/A", .x)
  )) |>
  # Strip curly braces echoed back from the format template
  mutate(across(
    contains("prediction"),
    ~ str_replace_all(.x, "[\\{\\}]", "")
  )) |>
  # Salience becomes a factor with level order 1, 0
  mutate(across(contains("salience"), ~ factor(.x, levels = c(1, 0))))

# Final output for the test set: replace the truth columns carried along so
# far with the versions from par_joined_s1, then shorten any valence
# starting with "N/A" to "N/A"
results_test_out <- results_test_parsed |>
  select(-contains("truth"), -inadequate_response_tocode_bin) |>
  left_join(par_joined_s1, by = "CaseId") |>
  mutate(across(
    contains("valence"),
    ~ ifelse(str_detect(.x, "^N/A"), "N/A", .x)
  ))

# Store LLM coded entries 
# Commented out; replication package contains the responses we received
# saveRDS(results_test_out, here("data/llm_results_test.RDS"))

###############################
## Apply LLM to whole sample ##
###############################

# Unlabeled sample for Claude: parents that are not part of the hand-coded
# (training or test) observations, formatted like the labeled data
df_paronly_unlabeled <- df_paronly |>
  anti_join(combined_qual_valid, by = "CaseId") |>
  # Keep the first two underscore-separated parts of derived_cell
  mutate(
    limited_cell = str_replace(derived_cell, "^(([^_]*_[^_]*))_.*$", "\\1")
  ) |>
  select(CaseId, derived_compcheck, derived_binary_freeresp, limited_cell) |>
  # limited_cell has the form <status quo>_<choice>
  separate(limited_cell, into = c("status_quo", "choice"), sep = "_") |>
  mutate(
    # Wording of the status-quo method as it appears in the prompt
    status_quo_text = case_when(
      status_quo == "Counselor discretion" ~ "the school counselor's judgment",
      status_quo == "Parent requests" ~ "parent requests as they are made",
      status_quo == "Simple rule" ~ "a test score and family income cutoff",
      status_quo == "Weighted lottery" ~
        "a lottery with higher odds for some students"
    ),
    preference_desc = ifelse(
      choice == "algfairer",
      paste0("algorithms are fairer than ", status_quo_text),
      paste0("algorithms are less fair than ", status_quo_text)
    ),
    preferred_option = ifelse(
      choice == "algfairer", "a predictive model", status_quo_text
    ),
    notpreferred_option = ifelse(
      choice == "algfairer", status_quo_text, "a predictive model"
    )
  )

# Missing values per column before filtering
colSums(is.na(df_paronly_unlabeled))

df_paronly_tocode <- df_paronly_unlabeled |>
  # Drop observations without a free-response answer
  # (58 observations in the original run)
  filter(!is.na(derived_binary_freeresp)) |>
  # Drop answers that are 6 characters or shorter, or that contain no
  # letters at all (28 observations in the original run)
  filter(!str_detect(derived_binary_freeresp, "^(?:.{0,6}|[^a-zA-Z]+)$")) |>
  mutate(
    # Prompt for this observation, built like the training/test prompts
    prompt_B = paste(
      set_promptB_1, tolower(preference_desc),
      set_promptB_2, preferred_option,
      set_promptB_2b, notpreferred_option,
      set_promptB_3,
      "<survey response>", derived_binary_freeresp, "</survey response>",
      sep = " "
    ),
    # Word count of the full prompt
    nwords = qdap::word_count(prompt_B)
  )

# Missing values per column after filtering
colSums(is.na(df_paronly_tocode))

# Code the unlabeled sample with Claude: one API call per row
# Costs around $15
results_unlab_claude <- df_paronly_tocode |>
  select(CaseId, prompt_B, ends_with("_bin")) |>
  mutate(
    result = future_pmap(
      list(id = CaseId, text = prompt_B),
      ~ get_response_claude(
        id = ..1,
        text = ..2,
        temperature = 0,
        model_name = set_claude_mod
      ),
      .progress = TRUE
    )
  ) |>
  # Spread the elements returned for each row into their own columns
  unnest_wider(result)

# Parse the raw replies for the unlabeled sample into prediction columns
results_unlab_out_prelim <- results_unlab_claude |>
  mutate(
    model = set_claude_mod,
    # Trim the reply and break it into lines
    split_response = str_split(str_trim(llm_response), "\n")
  ) |>
  rowwise() |>
  mutate(
    # Q1 / Q1b: salience and valence of targeting
    salience_target_prediction =
      extract_answer(split_response, "^Q1:|^1\\.?\\s*"),
    valence_target_prediction =
      extract_answer(split_response, "^Q1b:|^1b\\.?\\s*"),
    # Q2 / Q2b: salience and valence of impersonality
    salience_impersonal_prediction =
      extract_answer(split_response, "^Q2:|^2\\.?\\s*"),
    valence_impersonal_prediction =
      extract_answer(split_response, "^Q2b:|^2b\\.?\\s*")
  ) |>
  ungroup() |>
  select(-split_response) |>
  # The requested format ends the last answer with a period; drop it
  mutate(
    valence_impersonal_prediction =
      str_remove(valence_impersonal_prediction, "\\.$")
  ) |>
  # Bring the original free response back in for inspection
  left_join(
    select(df_paronly, CaseId, derived_binary_freeresp),
    by = "CaseId"
  ) |>
  # Any valence starting with "N/A" is shortened to "N/A"
  mutate(across(
    contains("valence"),
    ~ ifelse(str_detect(.x, "^N/A"), "N/A", .x)
  )) |>
  # Strip curly braces echoed back from the format template
  mutate(across(
    contains("prediction"),
    ~ str_replace_all(.x, "[\\{\\}]", "")
  )) |>
  # Salience becomes a factor with level order 1, 0
  mutate(across(contains("salience"), ~ factor(.x, levels = c(1, 0)))) |>
  # Second "N/A" pass, now that the braces are gone
  mutate(across(
    contains("valence"),
    ~ ifelse(str_detect(.x, "^N/A"), "N/A", .x)
  ))

# Missing values per column
colSums(is.na(results_unlab_out_prelim))

# Rows for which no Q1 answer could be extracted. In the original run this
# was a single observation whose free response referred back to the
# respondent's comprehension-check answer, so that answer is sent instead.
temp <- results_unlab_out_prelim |>
  filter(is.na(salience_target_prediction))

# Re-run the observation that produced no result, using derived_compcheck
# as the survey response in the prompt
results_unlab_claude_patch <- df_paronly_tocode |>
  filter(CaseId == "16632") |>
  mutate(
    prompt_B = paste(
      set_promptB_1, tolower(preference_desc),
      set_promptB_2, preferred_option,
      set_promptB_2b, notpreferred_option,
      set_promptB_3,
      "<survey response>", derived_compcheck, "</survey response>",
      sep = " "
    )
  ) |>
  select(CaseId, prompt_B, ends_with("_bin")) |>
  mutate(
    result = future_pmap(
      list(id = CaseId, text = prompt_B),
      ~ get_response_claude(
        id = ..1,
        text = ..2,
        temperature = 0,
        model_name = set_claude_mod
      ),
      .progress = TRUE
    )
  ) |>
  unnest_wider(result) |>
  # Trim the reply and break it into lines
  mutate(split_response = str_split(str_trim(llm_response), "\n")) |>
  rowwise() |>
  mutate(
    # Q1 / Q1b: salience and valence of targeting
    salience_target_prediction =
      extract_answer(split_response, "^Q1:|^1\\.?\\s*"),
    valence_target_prediction =
      extract_answer(split_response, "^Q1b:|^1b\\.?\\s*"),
    # Q2 / Q2b: salience and valence of impersonality
    salience_impersonal_prediction =
      extract_answer(split_response, "^Q2:|^2\\.?\\s*"),
    valence_impersonal_prediction =
      extract_answer(split_response, "^Q2b:|^2b\\.?\\s*")
  ) |>
  ungroup() |>
  select(-split_response) |>
  # The requested format ends the last answer with a period; drop it
  mutate(
    valence_impersonal_prediction =
      str_remove(valence_impersonal_prediction, "\\.$")
  ) |>
  # Strip curly braces echoed back from the format template
  mutate(across(
    contains("prediction"),
    ~ str_replace_all(.x, "[\\{\\}]", "")
  ))

# Swap the unparsed observation for its patched version.
# NOTE(review): the patched row has no `model` or `derived_binary_freeresp`
# column, its salience columns are character (factor in the preliminary
# results), and it skips the "N/A" shortening -- confirm the combined
# columns are as intended.
results_unlab_out <- bind_rows(
  filter(results_unlab_out_prelim, CaseId != "16632"),
  results_unlab_claude_patch
)

# Final checks: same number of rows as were sent for coding, and
# missing values per column
nrow(results_unlab_out) == nrow(df_paronly_tocode)
colSums(is.na(results_unlab_out))

# Store LLM coded entries 
# Commented out; replication package contains the responses we received
# saveRDS(results_unlab_out, here("data/llm_results_unlab.RDS"))